date <- Sys.Date()
in_dir <- file.path("..", "results/integrated_dfs")
out_dir <- file.path("..", "results/expression_output")
options(dplyr.summarise.inform = FALSE)
d.DeKegel_top5_pairs_drug_sens_expr_annot <- read_rds(file.path(in_dir, "d.DeKegel_top5_pairs_drug_sens_expr_annot"))
d.drug_sens_all_genes_expr <- read_rds(file.path(in_dir, "d.drug_sens_all_genes_expr"))
## keep only relevant columns
d.DeKegel_top5_pairs_drug_sens_expr <- d.DeKegel_top5_pairs_drug_sens_expr_annot %>%
dplyr::select(-c(entrez_pair:prediction_rank, prediction_score:family_size,
broad_id, primary_disease, subtype_disease))
d.drug_sens_all_genes_expr <- d.drug_sens_all_genes_expr %>%
rename(log_tpm = rna_expression)
## how many gene pairs are included in the paralog dataset?
d.DeKegel_top5_pairs_drug_sens_expr %>%
distinct(sorted_gene_pair) %>%
nrow()
## [1] 367
source("shared_functions.R")
This analysis is intended to demonstrate expected results for targeted inhibitors and their target (single) genes.
## running lm for each sorted_gene_pair/compound_id group
## use broom::glance to get rsquared and pval
d.drug_sens_all_genes_expr_lm_summary <- d.drug_sens_all_genes_expr %>%
filter(!is.na(dependency)) %>%
filter(!is.na(log_tpm)) %>%
group_by(target, compound_id) %>%
group_modify(~ broom::glance(lm(dependency ~ log_tpm, data = .x))) %>%
dplyr::ungroup() %>%
rename(r_squared = r.squared, p_val = p.value)
# d.drug_sens_all_genes_expr_lm_summary
## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.drug_sens_all_genes_expr_lm_summary)
# d.stats
d.drug_sens_all_genes_expr_lm_summary <- d.stats %>%
dplyr::select(-p_val) %>%
right_join(d.drug_sens_all_genes_expr_lm_summary, by = c("target", "compound_id")) %>%
dplyr::select(target, compound_id, r_squared, p_val, fdr)
# d.drug_sens_all_genes_expr_lm_summary
## use broom::tidy to get slope
d.drug_sens_all_genes_expr_lm_slope <- d.drug_sens_all_genes_expr %>%
filter(!is.na(dependency)) %>%
filter(!is.na(log_tpm)) %>%
group_by(target, compound_id) %>%
group_modify(~ broom::tidy(lm(dependency ~ log_tpm, data = .x))) %>%
dplyr::ungroup() %>%
## extract slope of lm fit line from output
filter(term == "log_tpm") %>%
rename(slope = estimate) %>%
dplyr::select(target, compound_id, slope)
# d.drug_sens_all_genes_expr_lm_slope
## add rsquared, p-val, fdr, and slope info to main DF
d.drug_sens_all_genes_expr_lm <- d.drug_sens_all_genes_expr %>%
left_join(d.drug_sens_all_genes_expr_lm_summary, by = c("target", "compound_id")) %>%
left_join(d.drug_sens_all_genes_expr_lm_slope, by = c("target", "compound_id")) %>%
filter(!is.na(dependency)) %>%
filter(!is.na(log_tpm))
summarize_stats(d.drug_sens_all_genes_expr_lm)
## df: d.drug_sens_all_genes_expr_lm
## # target/compound pairs with
## p < 0.05: 1236
## p < 0.01: 565
## FDR < 0.1: 369
## FDR < 0.05: 275
## number of pairs with negative vs. positive slope
d.drug_sens_all_genes_expr_lm %>%
distinct(target, compound_id, .keep_all = TRUE) %>%
mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
group_by(slope_flag) %>%
summarize(n = n()) %>%
print_kbl()
| slope_flag | n |
|---|---|
| negative | 5329 |
| positive | 5614 |
## get low expression = low LFC pairs (positive slope)
d.drug_sens_all_genes_expr_lm_pos_slope <-d.drug_sens_all_genes_expr_lm %>%
filter(slope > 0)
## get high expression = low LFC pairs (negative slope)
d.drug_sens_all_genes_expr_lm_neg_slope <- d.drug_sens_all_genes_expr_lm %>%
filter(slope < 0)
## summarize significant drug/gene combos for each new DF
summarize_stats(d.drug_sens_all_genes_expr_lm_pos_slope)
## df: d.drug_sens_all_genes_expr_lm_pos_slope
## # target/compound pairs with
## p < 0.05: 541
## p < 0.01: 194
## FDR < 0.1: 105
## FDR < 0.05: 74
summarize_stats(d.drug_sens_all_genes_expr_lm_neg_slope)
## df: d.drug_sens_all_genes_expr_lm_neg_slope
## # target/compound pairs with
## p < 0.05: 695
## p < 0.01: 371
## FDR < 0.1: 264
## FDR < 0.05: 201
d.top_neg_hits <- d.drug_sens_all_genes_expr_lm_neg_slope %>%
unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
arrange(fdr) %>%
filter(fdr < 0.05) %>%
distinct(target_compound_label)
print_kbl(head(d.top_neg_hits, n = 20))
| target_compound_label |
|---|
| MDM2 idasanutlin |
| MDM2 CGM097 |
| MDM2 AMG-232 |
| ERBB2 GW-583340 |
| ERBB2 AST-1306 |
| PDE3A anagrelide |
| ERBB2 lapatinib |
| FGFR2 AEE788 |
| PDGFRA lucitanib |
| MDM2 nutlin-3 |
| ERBB2 tucatinib |
| FGFR1 erdafitinib |
| ERBB2 AZD8931 |
| ERBB2 BMS-690514 |
| PDGFRA masitinib |
| ERBB2 PD-168393 |
| PDGFRA pazopanib |
| EGFR PD-153035 |
| FGFR3 AEE788 |
| EGFR BMS-690514 |
make_lm_plot <- function(df, plot_list, x_val){
d.plot <- df %>%
unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
filter(target_compound_label %in% plot_list) %>%
mutate("target_compound_label" = reorder(target_compound_label, fdr))
d.label <- d.plot %>%
group_by(target, compound_id) %>%
mutate(group_label_x = min({{x_val}}),
group_label_y = max(dependency)) %>%
dplyr::ungroup() %>%
dplyr::select(target_compound_label, r_squared, p_val, fdr, group_label_x, group_label_y) %>%
distinct(target_compound_label, .keep_all = TRUE) %>%
mutate(r_squared = round(r_squared, 3),
p_val = scientific(p_val, 3),
fdr = scientific(fdr, 3))
# d.label
plot <- ggplot(d.plot, aes(x = {{x_val}}, y = dependency)) +
geom_hline(yintercept = 0) +
geom_point(size = 1) +
geom_smooth(method = lm, se = FALSE, color = "gray60") +
geom_richtext(data = d.label,
aes(x = group_label_x, y = group_label_y,
## this package can do HTML-formatting for text
label = paste0("r<sup>2</sup>=", r_squared, "<br>",
"FDR =", fdr)),
## adjust label alignment and remove borders
hjust = 0, vjust = 1, label.color = NA, size = 3,
## label background = transparent, text is not
fill = alpha(c("white"), 0.75),
label.margin = unit(c(0, 0, 0, 0), "lines"),
label.padding = unit(c(0, 0, 0, 0), "lines")) +
labs(x = "log2(TPM)", y = "Cell_viability") +
theme_bw() +
theme(aspect.ratio = 1,
axis.text = element_text(color = "black"),
axis.ticks = element_line(color = "black")) +
facet_wrap(~target_compound_label, scales = "free", ncol = 3)
return(plot)
}
plot_list <- c("MDM2\nidasanutlin", "ERBB2\nGW-583340", "EGFR\nPD-153035")
selected_top_lm_hits <- make_lm_plot(d.drug_sens_all_genes_expr_lm_neg_slope, plot_list, log_tpm)
selected_top_lm_hits
d.drug_sens_all_genes_expr_lm_neg_slope_topHits <- d.drug_sens_all_genes_expr_lm_neg_slope %>%
filter(fdr < 0.05) %>%
arrange(fdr) %>%
distinct(target, compound_id, .keep_all = TRUE) %>%
dplyr::select(target, compound_name, slope, r_squared:fdr, moa)
print_kbl(head(d.drug_sens_all_genes_expr_lm_neg_slope_topHits))
| target | compound_name | slope | r_squared | p_val | fdr | moa |
|---|---|---|---|---|---|---|
| MDM2 | idasanutlin | -0.6634255 | 0.2571951 | 0 | 0 | MDM inhibitor |
| MDM2 | CGM097 | -0.5986578 | 0.2523123 | 0 | 0 | MDM inhibitor |
| MDM2 | AMG-232 | -0.5354975 | 0.2197390 | 0 | 0 | MDM inhibitor |
| ERBB2 | GW-583340 | -0.1913803 | 0.1504336 | 0 | 0 | EGFR inhibitor |
| ERBB2 | AST-1306 | -0.1943274 | 0.1445403 | 0 | 0 | EGFR inhibitor |
| PDE3A | anagrelide | -0.2086442 | 0.1393173 | 0 | 0 | phosphodiesterase inhibitor |
d.drug_sens_all_genes_expr_lm_pos_slope_fdr_topHits <- d.drug_sens_all_genes_expr_lm_pos_slope %>%
filter(fdr < 0.05) %>%
arrange(fdr) %>%
distinct(target, compound_id, .keep_all = TRUE) %>%
dplyr::select(target, compound_name, slope, r_squared:fdr, moa)
print_kbl(head(d.drug_sens_all_genes_expr_lm_pos_slope_fdr_topHits))
| target | compound_name | slope | r_squared | p_val | fdr | moa |
|---|---|---|---|---|---|---|
| AKT3 | GSK2110183 | 0.1494385 | 0.0982682 | 0 | 0.0e+00 | AKT inhibitor |
| AKT3 | MK-2206 | 0.1194117 | 0.0873418 | 0 | 0.0e+00 | AKT inhibitor |
| NQO1 | menadione | 0.1318066 | 0.0801592 | 0 | 0.0e+00 | mitochondrial DNA polymerase inhibitor, phosphatase inhibitor |
| PIK3CD | alpelisib | 0.1273025 | 0.0629412 | 0 | 4.0e-07 | PI3K inhibitor |
| AKT3 | GDC-0068 | 0.1148292 | 0.0607472 | 0 | 7.0e-07 | AKT inhibitor |
| ATP1A1 | k-strophanthidin | 0.3351501 | 0.0587694 | 0 | 1.6e-06 | ATPase inhibitor |
Since many target genes do not show a truly linear pattern of expression vs. cell viability, and in some cases the linear regression fit line seemed to be strongly influenced by outliers, we decided to take another approach to model these data. We instead used an outlier-based approach to group cell lines into low, high, and “neither” expression categories.
## robustly scale expression data
d.drug_sens_all_genes_expr_outlier <- d.drug_sens_all_genes_expr %>%
filter(!is.na(log_tpm)) %>%
filter(!is.na(dependency)) %>%
group_by(target, compound_id) %>%
mutate(scaled_log_tpm = RobScale(log_tpm, center = TRUE, scale = TRUE)) %>%
mutate(expr_outlier_flag = case_when(
scaled_log_tpm < -2 ~ "low",
scaled_log_tpm > 2 ~ "high",
TRUE ~ "neither")) %>%
mutate(expr_low_flag = case_when(
scaled_log_tpm < -2 ~ TRUE,
TRUE ~ FALSE)) %>% ## if expression ≥ -2, classify as "not low"
ungroup() %>%
mutate(expr_outlier_flag = factor(expr_outlier_flag, levels = c("low", "neither", "high")))
## how many target/compound pairs have no expression outliers?
d.drug_sens_all_genes_expr_outlier %>%
group_by(target, compound_id) %>%
distinct(expr_low_flag) %>%
summarize(n = n()) %>%
filter(n < 2) %>%
nrow() %>%
print_kbl()
| x |
|---|
| 8236 |
d.drug_sens_all_genes_expr_outlier %>%
group_by(target, compound_id, expr_outlier_flag) %>%
summarize(n = n()) %>%
group_by(expr_outlier_flag) %>%
summarize(mean = mean(n),
median = median(n)) %>%
print_kbl()
| expr_outlier_flag | mean | median |
|---|---|---|
| low | 24.69228 | 21 |
| neither | 451.99305 | 455 |
| high | 95.56378 | 99 |
## how many compounds/targets have 0 high or low outlier cell lines?
d.drug_sens_all_genes_expr_outlier %>%
group_by(target, compound_id, expr_outlier_flag) %>%
summarize(n = n()) %>%
ungroup() %>%
group_by(target, compound_id) %>%
summarize(n = n()) %>%
filter(n < 3) %>%
nrow() %>%
print_kbl()
| x |
|---|
| 8276 |
## low vs. neither (with high included but ignoring it) =>
## calculate diff in median LFC between relevant groups
d.drug_sens_all_genes_expr_outlier_low <- d.drug_sens_all_genes_expr_outlier %>%
group_by(target, compound_id) %>%
## get rid of drug/targets that don't have any "low" cell lines
filter(any(expr_outlier_flag == "low")) %>%
ungroup()
d.drug_sens_all_genes_expr_outlier_low_grp_med_diff <- d.drug_sens_all_genes_expr_outlier_low %>%
group_by(target, compound_id, expr_outlier_flag) %>%
## calculate median dependency score for each outlier group
summarize(grp_med_dependency = median(dependency)) %>%
pivot_wider(names_from = expr_outlier_flag, values_from = grp_med_dependency) %>%
## calculate difference between low and neither groups
mutate(grp_med_diff = low - neither) %>%
dplyr::select(target, compound_id, grp_med_diff)
d.drug_sens_all_genes_expr_outlier_low <- d.drug_sens_all_genes_expr_outlier_low %>%
left_join(d.drug_sens_all_genes_expr_outlier_low_grp_med_diff, by = c("target", "compound_id"))
## high vs. neither (with low included but ignoring it) =>
## calculate diff in median LFC between relevant groups =>
d.drug_sens_all_genes_expr_outlier_high <- d.drug_sens_all_genes_expr_outlier %>%
group_by(target, compound_id) %>%
## get rid of drug/targets that don't have any "high" cell lines
filter(any(expr_outlier_flag == "high")) %>%
ungroup()
d.drug_sens_all_genes_expr_outlier_high_grp_med_diff <- d.drug_sens_all_genes_expr_outlier_high %>%
group_by(target, compound_id, expr_outlier_flag) %>%
## calculate median dependency score for each outlier group
summarize(grp_med_dependency = median(dependency)) %>%
pivot_wider(names_from = expr_outlier_flag, values_from = grp_med_dependency) %>%
## calculate difference between low and neither groups
mutate(grp_med_diff = high - neither) %>%
dplyr::select(target, compound_id, grp_med_diff)
d.drug_sens_all_genes_expr_outlier_high <- d.drug_sens_all_genes_expr_outlier_high %>%
left_join(d.drug_sens_all_genes_expr_outlier_high_grp_med_diff, by = c("target", "compound_id"))
## low outliers and drug sensitivity
## calculate p-value (adjusted for multiple within-group tests?)
d.drug_sens_all_genes_expr_outlier_low_wrst_pvals <- d.drug_sens_all_genes_expr_outlier_low %>%
## remove cell lines that are high outliers
filter(expr_outlier_flag != "high") %>%
group_by(target, compound_id) %>%
summarize(p_val = wilcox.test(dependency ~ expr_outlier_flag)$p.value)
# d.drug_sens_all_genes_expr_outlier_low_wrst_sens_pvals
## adjust for multiple testing overall
d.stats <- BH_adjust(d.drug_sens_all_genes_expr_outlier_low_wrst_pvals)
## Adding missing grouping variables: `target`
# d.stats
d.drug_sens_all_genes_expr_outlier_low_wrst <- d.drug_sens_all_genes_expr_outlier_low %>%
left_join(d.stats, by = c("target", "compound_id"))
d.drug_sens_all_genes_expr_outlier_low_wrst_sens <- d.drug_sens_all_genes_expr_outlier_low_wrst %>%
filter(grp_med_diff < 0)
d.drug_sens_all_genes_expr_outlier_low_wrst_resist <- d.drug_sens_all_genes_expr_outlier_low_wrst %>%
filter(grp_med_diff > 0)
# d.drug_sens_all_genes_expr_outlier_low_wrst_resist
## high outliers and drug sensitivity
d.drug_sens_all_genes_expr_outlier_high_wrst_sens_pvals <- d.drug_sens_all_genes_expr_outlier_high %>%
## remove cell lines that are low outliers
filter(expr_outlier_flag != "low") %>%
group_by(target, compound_id) %>%
summarize(p_val = wilcox.test(dependency ~ expr_outlier_flag)$p.value)
# d.drug_sens_all_genes_expr_outlier_high_wrst_sens_pvals
## adjust for multiple testing overall
d.stats <- BH_adjust(d.drug_sens_all_genes_expr_outlier_high_wrst_sens_pvals)
## Adding missing grouping variables: `target`
# d.stats
d.drug_sens_all_genes_expr_outlier_high_wrst <- d.drug_sens_all_genes_expr_outlier_high %>%
left_join(d.stats, by = c("target", "compound_id"))
# d.drug_sens_all_genes_expr_outlier_high_wrst_sens
## high outliers and drug sensitivity
d.drug_sens_all_genes_expr_outlier_high_wrst_sens <- d.drug_sens_all_genes_expr_outlier_high_wrst %>%
## remove cell lines where high outliers don't have a higher median dependency score than the other groups
filter(grp_med_diff < 0)
d.drug_sens_all_genes_expr_outlier_high_wrst_resist <- d.drug_sens_all_genes_expr_outlier_high_wrst %>%
## remove cell lines where low outliers don't have a lower median dependency score than the other groups
filter(grp_med_diff > 0)
## low outliers
summarize_stats(d.drug_sens_all_genes_expr_outlier_low_wrst_sens)
## df: d.drug_sens_all_genes_expr_outlier_low_wrst_sens
## # target/compound pairs with
## p < 0.05: 89
## p < 0.01: 31
## FDR < 0.1: 4
## FDR < 0.05: 3
summarize_stats(d.drug_sens_all_genes_expr_outlier_low_wrst_resist)
## df: d.drug_sens_all_genes_expr_outlier_low_wrst_resist
## # target/compound pairs with
## p < 0.05: 189
## p < 0.01: 70
## FDR < 0.1: 26
## FDR < 0.05: 21
## high outliers
summarize_stats(d.drug_sens_all_genes_expr_outlier_high_wrst_sens)
## df: d.drug_sens_all_genes_expr_outlier_high_wrst_sens
## # target/compound pairs with
## p < 0.05: 423
## p < 0.01: 145
## FDR < 0.1: 33
## FDR < 0.05: 24
summarize_stats(d.drug_sens_all_genes_expr_outlier_high_wrst_resist)
## df: d.drug_sens_all_genes_expr_outlier_high_wrst_resist
## # target/compound pairs with
## p < 0.05: 417
## p < 0.01: 101
## FDR < 0.1: 5
## FDR < 0.05: 5
make_outlier_plot <- function(df, plot_list, direction, outlier_var, expr_var){
d.plot <- df %>%
unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
filter(target_compound_label %in% plot_list) %>%
mutate("target_compound_label" = reorder(target_compound_label, fdr))
d.label <- d.plot %>%
group_by(target, compound_id) %>%
mutate(y.position = (max(dependency) + 0.1*(max(dependency) - min(dependency)))) %>%
dplyr::ungroup() %>%
dplyr::select(target_compound_label, fdr, y.position) %>%
distinct(target_compound_label, .keep_all = TRUE) %>%
mutate(group1 = direction,
group2 = "neither",
fdr = scientific(fdr, 3))
# d.label
p <- ggplot(d.plot, aes(x = {{outlier_var}}, y = dependency)) +
geom_hline(yintercept = 0) +
geom_jitter(aes(color = {{outlier_var}}), width = 0.3, alpha = 0.6) +
geom_boxplot(aes(fill = {{outlier_var}}), outlier.shape = NA, coef = 0) +
stat_pvalue_manual(d.label, label = paste0("FDR = {fdr}"), tip.length = 0.03, size = 3, vjust = -0.5) +
## adds extra space on top & bottom so p-value label will fit
scale_y_continuous(expand = expansion(mult = 0.15)) +
# scale_color_manual(values = c("low" = "blue", "neither" = "gray50", "high" = "red")) +
scale_color_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
scale_fill_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
labs(x = "Cell_viability",
y = "Expression_group") +
theme_bw() +
theme(aspect.ratio = 0.75,
axis.text = element_text(color = "black"),
axis.ticks = element_line(color = "black"),
legend.position = "none") +
facet_wrap(~target_compound_label, scales = "free_y",
ncol = 1)
q <- ggplot(d.plot, aes(x = target, y = {{expr_var}})) +
geom_jitter(aes(color = {{outlier_var}}), width = 0.3) +
geom_boxplot(alpha = 0.3, outlier.shape = NA, coef = 0) +
# scale_color_manual(values = c("low" = "blue", "neither" = "gray50", "high" = "red")) +
scale_color_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
labs(x = "Target", y = "log2(TPM)") +
theme_bw() +
theme(aspect.ratio = 1,
axis.text = element_text(color = "black"),
axis.ticks = element_line(color = "black")) +
facet_wrap(~target_compound_label, scale = "free", ncol = 1)
ggarrange(p, q, ncol = 2, align = "hv",
common.legend = TRUE, legend = "right")
}
make_outlier_plot(d.drug_sens_all_genes_expr_outlier_low_wrst_resist,
c("MDM2\nidasanutlin", "ERBB2\nGW-583340", "EGFR\nPD-153035"), "low", expr_outlier_flag, log_tpm)
Compare this output to that from the linear model-based approach:
selected_top_lm_hits
## `geom_smooth()` using formula 'y ~ x'
What does the expression of each target gene look like across cell lineages?
make_lineage_plot <- function(df, plot_list, y_var, color_flag){
d.plot <- df %>%
ungroup() %>%
unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
filter(target_compound_label %in% plot_list) %>%
mutate("target_compound_label" = reorder(target_compound_label, fdr),
"target" = reorder(target, fdr))
ggplot(d.plot, aes(x = lineage, y = {{y_var}})) +
geom_jitter(aes(color = {{color_flag}}), width = 0.3, size = 1) +
geom_boxplot(alpha = 0, outlier.shape = NA, coef = 0) +
scale_color_manual(values = c("low" = "#0080C6", "neither" = "#7F7F7F", "high" = "#B84250")) +
labs(x = "Cell_lineage", y = "log2(TPM)") +
theme_bw() +
theme(axis.text = element_text(color = "black"),
axis.ticks = element_line(color = "black"),
axis.text.x = element_text(angle = 45, hjust=1)) +
facet_wrap(~target, scales = "free_y",
ncol = 1)
}
make_lineage_plot(d.drug_sens_all_genes_expr_outlier_low_wrst_resist,
c("MDM2\nidasanutlin", "ERBB2\nGW-583340", "EGFR\nPD-153035"), log_tpm, expr_outlier_flag)
## add target col so single-gene functions will work
d.DeKegel_top5_pairs_drug_sens_expr <- d.DeKegel_top5_pairs_drug_sens_expr %>%
mutate(target = sorted_gene_pair)
# d.DeKegel_top5_pairs_drug_sens_expr
## A1 expr
## use broom::glance to get rsquared and pval
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary <- d.DeKegel_top5_pairs_drug_sens_expr %>%
filter(!is.na(dependency)) %>%
filter(!is.na(A1_log_tpm)) %>%
group_by(target, compound_id) %>%
group_modify(~ broom::glance(lm(dependency ~ A1_log_tpm, data = .x))) %>%
dplyr::ungroup() %>%
rename(r_squared = r.squared, p_val = p.value)
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary
## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary)
# d.stats
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary <- d.stats %>%
dplyr::select(-p_val) %>%
right_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary, by = c("target", "compound_id")) %>%
dplyr::select(target, compound_id, r_squared, p_val, fdr)
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary
## use broom::tidy to get slope
## note: group_modify expects a DF output
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope <- d.DeKegel_top5_pairs_drug_sens_expr %>%
filter(!is.na(dependency)) %>%
filter(!is.na(A1_log_tpm)) %>%
group_by(target, compound_id) %>%
group_modify(~ broom::tidy(lm(dependency ~ A1_log_tpm, data = .x))) %>%
dplyr::ungroup() %>%
## extract slope of lm fit line from output
filter(term == "A1_log_tpm") %>%
rename(slope = estimate) %>%
dplyr::select(target, compound_id, slope)
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope
## add rsquared, p-val, fdr, and slope info to main DF
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm <- d.DeKegel_top5_pairs_drug_sens_expr %>%
left_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_summary, by = c("target", "compound_id")) %>%
left_join(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_slope, by = c("target", "compound_id")) %>%
filter(!is.na(dependency)) %>%
filter(!is.na(A1_log_tpm))
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm
## # target/compound pairs with
## p < 0.05: 481
## p < 0.01: 249
## FDR < 0.1: 251
## FDR < 0.05: 142
## A2 expr
## use broom::glance to get rsquared and pval
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary <- d.DeKegel_top5_pairs_drug_sens_expr %>%
filter(!is.na(dependency)) %>%
filter(!is.na(A2_log_tpm)) %>%
group_by(target, compound_id) %>%
group_modify(~ broom::glance(lm(dependency ~ A2_log_tpm, data = .x))) %>%
dplyr::ungroup() %>%
rename(r_squared = r.squared, p_val = p.value)
# d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary
## FDR-adjust p-vals for multiple testing and add back to summary DF
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary)
# d.stats
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary <- d.stats %>%
dplyr::select(-p_val) %>%
right_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary, by = c("target", "compound_id")) %>%
dplyr::select(target, compound_id, r_squared, p_val, fdr)
# d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary
## use broom::tidy to get slope
## note: group_modify expects a DF output
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope <- d.DeKegel_top5_pairs_drug_sens_expr %>%
filter(!is.na(dependency)) %>%
filter(!is.na(A2_log_tpm)) %>%
group_by(target, compound_id) %>%
group_modify(~ broom::tidy(lm(dependency ~ A2_log_tpm, data = .x))) %>%
dplyr::ungroup() %>%
## extract slope of lm fit line from output
filter(term == "A2_log_tpm") %>%
rename(slope = estimate) %>%
dplyr::select(target, compound_id, slope)
# d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope
## add rsquared, p-val, fdr, and slope info to main DF
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm <- d.DeKegel_top5_pairs_drug_sens_expr %>%
left_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_summary, by = c("target", "compound_id")) %>%
left_join(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_slope, by = c("target", "compound_id")) %>%
filter(!is.na(dependency)) %>%
filter(!is.na(A2_log_tpm))
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm
## # target/compound pairs with
## p < 0.05: 340
## p < 0.01: 156
## FDR < 0.1: 128
## FDR < 0.05: 74
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
distinct(target, compound_id, .keep_all = TRUE) %>%
mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
group_by(slope_flag) %>%
summarize(n = n()) %>%
print_kbl()
| slope_flag | n |
|---|---|
| negative | 1343 |
| positive | 1039 |
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
distinct(target, compound_id, .keep_all = TRUE) %>%
mutate(slope_flag = ifelse(slope < 0, "negative", "positive")) %>%
group_by(slope_flag) %>%
summarize(n = n()) %>%
print_kbl()
| slope_flag | n |
|---|---|
| negative | 1222 |
| positive | 1160 |
## get low expression = low LFC pairs (positive slope)
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope <-d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
filter(slope > 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope
## # target/compound pairs with
## p < 0.05: 120
## p < 0.01: 48
## FDR < 0.1: 48
## FDR < 0.05: 29
## get high expression = low LFC pairs (negative slope)
d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope <- d.DeKegel_top5_pairs_drug_sens_A1_expr_lm %>%
filter(slope < 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope
## # target/compound pairs with
## p < 0.05: 361
## p < 0.01: 201
## FDR < 0.1: 203
## FDR < 0.05: 113
## get low expression = low LFC pairs (positive slope)
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope <-d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
filter(slope > 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope
## # target/compound pairs with
## p < 0.05: 158
## p < 0.01: 82
## FDR < 0.1: 67
## FDR < 0.05: 39
## get high expression = low LFC pairs (negative slope)
d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope <- d.DeKegel_top5_pairs_drug_sens_A2_expr_lm %>%
filter(slope < 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope)
## df: d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope
## # target/compound pairs with
## p < 0.05: 182
## p < 0.01: 74
## FDR < 0.1: 61
## FDR < 0.05: 35
A1_top_hits <- d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope %>%
dplyr::select(target, compound_name, r_squared, fdr) %>%
distinct(target, compound_name, .keep_all = TRUE) %>%
arrange(fdr)
print_kbl(head(A1_top_hits, n = 20))
| target | compound_name | r_squared | fdr |
|---|---|---|---|
| FYN_SRC | vandetanib | 0.0659658 | 0.0000014 |
| FYN_SRC | saracatinib | 0.0572548 | 0.0000035 |
| ATP1A1_ATP1A2 | k-strophanthidin | 0.0587694 | 0.0000035 |
| ATP1A1_ATP1A3 | k-strophanthidin | 0.0587694 | 0.0000035 |
| ATP1A1_ATP1A4 | k-strophanthidin | 0.0587694 | 0.0000035 |
| FYN_YES1 | saracatinib | 0.0572548 | 0.0000035 |
| FYN_SRC | PD-168393 | 0.0534604 | 0.0000095 |
| ARAF_RAF1 | AZ-628 | 0.0376702 | 0.0004196 |
| MAPK11_MAPK14 | tyrphostin-AG-1478 | 0.0336892 | 0.0008928 |
| FYN_SRC | bosutinib | 0.0326309 | 0.0011196 |
| FYN_YES1 | bosutinib | 0.0326309 | 0.0011196 |
| MAP2K1_MAP2K2 | arctigenin | 0.0301968 | 0.0024172 |
| CDK2_CDK3 | NU6027 | 0.0258299 | 0.0066929 |
| CDK2_CDK5 | NU6027 | 0.0258299 | 0.0066929 |
| EPAS1_HIF1A | BAY-87-2243 | 0.0245436 | 0.0097530 |
| EPAS1_HIF1A | 2-methoxyestradiol | 0.0239446 | 0.0107025 |
| NR1I2_VDR | erlotinib | 0.0219698 | 0.0137919 |
| CAMK2D_CAMK2G | bosutinib | 0.0215397 | 0.0151139 |
| ESR1_ESR2 | ethynodiol-diacetate | 0.0209548 | 0.0180253 |
| GSK3A_GSK3B | SB-203580 | 0.0209268 | 0.0181552 |
A1_top_hits_plot_list <- A1_top_hits %>%
arrange(fdr) %>%
unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
head(n = 9) %>%
pull(target_compound_label)
# A1_top_hits_plot_list
make_lm_plot(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope, A1_top_hits_plot_list, A1_log_tpm)
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope
A2_top_hits <- d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope %>%
dplyr::select(target, compound_name, r_squared, fdr) %>%
distinct(target, compound_name, .keep_all = TRUE) %>%
arrange(fdr)
print_kbl(head(A2_top_hits, n = 20))
| target | compound_name | r_squared | fdr |
|---|---|---|---|
| AKT1_AKT3 | GSK2110183 | 0.0982682 | 0.0000000 |
| AKT2_AKT3 | GSK2110183 | 0.0982682 | 0.0000000 |
| AKT1_AKT3 | MK-2206 | 0.0873418 | 0.0000000 |
| AKT2_AKT3 | MK-2206 | 0.0873418 | 0.0000000 |
| AKT1_AKT3 | GDC-0068 | 0.0607472 | 0.0000013 |
| AKT2_AKT3 | GDC-0068 | 0.0607472 | 0.0000013 |
| MAP2K1_MAP2K2 | bosutinib | 0.0474871 | 0.0000464 |
| AKT1_AKT3 | AZD5363 | 0.0474116 | 0.0000464 |
| AKT2_AKT3 | AZD5363 | 0.0474116 | 0.0000464 |
| ABL1_ABL2 | saracatinib | 0.0444061 | 0.0000956 |
| PIK3R1_PIK3R2 | CUDC-907 | 0.0410053 | 0.0003328 |
| CDK2_CDK5 | bosutinib | 0.0371005 | 0.0006195 |
| PTPN11_PTPN6 | BVT-948 | 0.0264080 | 0.0107209 |
| CDK1_CDK2 | NU6027 | 0.0258299 | 0.0111549 |
| HDAC4_HDAC5 | romidepsin | 0.0246434 | 0.0157450 |
| EPAS1_HIF1A | BAY-87-2243 | 0.0243860 | 0.0160324 |
| ROCK1_ROCK2 | SB-203580 | 0.0236855 | 0.0187571 |
| MAP3K2_MAP3K3 | bosutinib | 0.0233228 | 0.0195743 |
| AKT1_AKT2 | A-674563 | 0.0221977 | 0.0259942 |
| PPP3CA_PPP3CB | cyclosporin-a | 0.0240152 | 0.0260890 |
A2_top_hits_plot_list <- A2_top_hits %>%
arrange(fdr) %>%
unite("target_compound_label", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
head(n = 9) %>%
pull(target_compound_label)
# A1_top_hits_plot_list
make_lm_plot(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope, A2_top_hits_plot_list, A2_log_tpm)
# d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope
# write_paralog_lm_output_df <- function(df, col_name, stat_cutoff){
#
# df_out <- df %>%
# filter({{col_name}} < stat_cutoff) %>%
# arrange({{col_name}}) %>%
# distinct(target, compound_id, .keep_all = TRUE) %>%
# dplyr::select(target, compound_name, r_squared:slope, moa)
#
# df_name <- deparse(substitute(df))
# stat_name <- deparse(substitute(col_name))
#
# # df_name <- paste0(deparse(substitute(df)), "_",
# # deparse(substitute(col_name)), "_",
# # "L", stat_cutoff, ".txt")
# # print(df_name)
# # print(stat_name)
#
# # print(nrow(df_out))
#
# out_path <- file.path(out_dir, paste0(df_name, "_", stat_name, "_", "L", stat_cutoff, ".txt"))
# # print(out_path)
#
# write_tsv(df_out, out_path)
#
# cat("\ndf written to", out_path)
# }
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_pos_slope,
# fdr, 0.1)
#
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_pos_slope,
# fdr, 0.1)
#
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A1_expr_lm_neg_slope,
# fdr, 0.1)
#
# write_paralog_lm_output_df(d.DeKegel_top5_pairs_drug_sens_A2_expr_lm_neg_slope,
# fdr, 0.1)
## A2 top hits
target_list <- c("AKT1_AKT2", "AKT2_AKT3", "AKT1_AKT3", "PTPN11_PTPN6", "CDK2_CDK5", "CDK1_CDK2", "MAP2K1_MAP2K2")
compound_list <- c("GSK2110183", "MK−2206", "GDC−0068", "AZD5363", "BVT_948", "bosutinib", "NU6027", "canertinib")
## robustly scale expression data
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered <- d.DeKegel_top5_pairs_drug_sens_expr %>%
filter(target %in% target_list) %>%
filter(grepl(str_c(compound_list, collapse = "|"), compound_name)) %>%
filter(!is.na(A2_log_tpm) | !is.na(dependency)) %>%
group_by(target, compound_id) %>%
mutate(scaled_A2_log_tpm = RobScale(A2_log_tpm, center = TRUE, scale = TRUE)) %>%
mutate(A2_expr_outlier_flag = case_when(
scaled_A2_log_tpm < -1.5 ~ "low",
scaled_A2_log_tpm > 1.5 ~ "high",
TRUE ~ "neither")) %>%
mutate(scaled_A1_log_tpm = RobScale(A1_log_tpm, center = TRUE, scale = TRUE)) %>%
mutate(A1_expr_outlier_flag = case_when(
scaled_A1_log_tpm < -1.5 ~ "low",
scaled_A1_log_tpm > 1.5 ~ "high",
TRUE ~ "neither")) %>%
ungroup() %>%
mutate(A2_expr_outlier_flag = factor(A2_expr_outlier_flag, levels = c("low", "neither", "high")))%>%
mutate(A1_expr_outlier_flag = factor(A1_expr_outlier_flag, levels = c("low", "neither", "high")))
# d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_unfiltered %>%
group_by(target, compound_id) %>%
## get rid of drug/targets that don't have any "low" cell lines
filter(any(A2_expr_outlier_flag == "low")) %>%
ungroup()
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_grp_med_diff <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
group_by(target, compound_id, A2_expr_outlier_flag) %>%
## calculate median dependency score for each outlier group
summarize(grp_med_dependency = median(dependency)) %>%
pivot_wider(names_from = A2_expr_outlier_flag, values_from = grp_med_dependency) %>%
## calculate difference between low and neither groups
mutate(grp_med_diff = low - neither) %>%
dplyr::select(target, compound_id, grp_med_diff)
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
left_join(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_grp_med_diff, by = c("target", "compound_id"))
## low outliers and drug sensitivity
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_pvals <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
filter(A2_expr_outlier_flag != "high") %>%
group_by(target, compound_id) %>%
summarize(p_val = wilcox.test(dependency ~ A2_expr_outlier_flag)$p.value)
# d.drug_sens_all_genes_expr_outlier_low_wrst_sens_pvals
## adjust for multiple testing overall
d.stats <- BH_adjust(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_pvals)
## Adding missing grouping variables: `target`
# d.stats
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers %>%
left_join(d.stats, by = c("target", "compound_id"))
## filter for hits where low-expressing cell lines are more sensitive to the drug
## (low group has lower median than neither group)
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst %>%
filter(grp_med_diff < 0)
## filter for hits where low-expressing ell lines are more resistant to the drug
## (low group has higher median than neither group)
d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst %>%
filter(grp_med_diff > 0)
summarize_stats(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens)
## df: d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens
## # target/compound pairs with
## p < 0.05: 5
## p < 0.01: 5
## FDR < 0.1: 5
## FDR < 0.05: 5
summarize_stats(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist)
## df: d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_resist
## # target/compound pairs with
## p < 0.05: 0
## p < 0.01: 0
## FDR < 0.1: 0
## FDR < 0.05: 0
# d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens
plot_list <- d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens %>%
unite("target_compound_id", c("target", "compound_name"), sep = "\n", remove = FALSE) %>%
distinct(target_compound_id) %>%
pull(target_compound_id)
make_outlier_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
plot_list[1:4], "low", A2_expr_outlier_flag, A2_log_tpm)
make_outlier_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
plot_list[5:8], "low", A2_expr_outlier_flag, A2_log_tpm)
make_lineage_plot(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
plot_list, A2_log_tpm, A2_expr_outlier_flag)
write_rds(d.DeKegel_top5_pairs_drug_sens_expr_A2_lm_pos_top_outliers_wrst_sens,
file.path(out_dir, "d.top_outlier_pairs"))